{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import json\n",
    "import jsonlines\n",
    "from collections import defaultdict\n",
    "import random\n",
    "from tqdm import tqdm\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Statistical random baseline (sampling values observed in the training data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One list of parsed JSONL records per fold index, for each split.\n",
    "train_data = defaultdict(list)\n",
    "test_data = defaultdict(list)\n",
    "\n",
    "for i in range(5):\n",
    "    # For fold i, read the train file first and then the test file.\n",
    "    fold_files = (\n",
    "        (f'train_{i}.jsonl', train_data),\n",
    "        (f'test_{i}.jsonl', test_data),\n",
    "    )\n",
    "    for path, store in fold_files:\n",
    "        with jsonlines.open(path) as reader:\n",
    "            for record in reader.iter():\n",
    "                store[i].append(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every value observed for each field in fold 0 of the training data\n",
    "# (duplicates kept), keyed by field name.\n",
    "conditions = defaultdict(list)\n",
    "\n",
    "for sample in train_data[0]:\n",
    "    # The content of the third message is a JSON string; parse it into a dict.\n",
    "    answer = json.loads(sample['messages'][2]['content'])\n",
    "\n",
    "    for field, observed in answer.items():\n",
    "        if field != 'precursors':\n",
    "            # Lists are stored as sorted tuples, so element order does not matter.\n",
    "            if isinstance(observed, list):\n",
    "                observed = tuple(sorted(observed))\n",
    "            conditions[field].append(observed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# De-duplicate the observed values of every field (the resulting order is arbitrary).\n",
    "condition_set = {}\n",
    "for field, observed in conditions.items():\n",
    "    condition_set[field] = list(set(observed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# Tally how often each name occurs across the de-duplicated 'solvent' values.\n",
    "sl_counter = Counter(\n",
    "    name\n",
    "    for combo in condition_set['solvent']\n",
    "    for name in combo\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['water', 'DMF', 'MeOH', 'EtOH', 'acetonitrile', 'DMSO', 'DMA', 'CH2Cl2', 'CHCl3', 'THF', 'toluene', 'Et3N', 'acetone', 'DEF', 'benzene', 'pyridine', 'isopropanol', 'NMP', 'ethylene glycol', 'diethyl ether', 'AcOH', 'dioxane', 'DMAC', 'hexane', 'formic acid', 'cyclohexane', 'BuOH', 'TEA', 'Ethanol', 'nitrobenzene', 'DMI', 'ethyl acetate', 'DMAc', 'DME', 'isobutanol', 'TBA', 'TPA', 'NMA', 'DMAE', 'dichlorobenzene', 'DEA', 'formamide']\n"
     ]
    }
   ],
   "source": [
    "# Solvent names ordered from the highest count to the lowest.\n",
    "print([name for name, _count in sl_counter.most_common()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_random_prediction(true, condition=None):\n",
    "    \"\"\"Build a random baseline prediction for one dataset record.\n",
    "\n",
    "    Args:\n",
    "        true: A record whose ``messages[2]['content']`` is a JSON string\n",
    "            holding the ground-truth fields.\n",
    "        condition: Mapping from field name to a sequence of candidate values.\n",
    "            When omitted, the module-level ``condition_set`` is used; it is\n",
    "            looked up at call time so a re-computed ``condition_set`` is seen.\n",
    "\n",
    "    Returns:\n",
    "        ``(truth, prediction)``: the parsed ground-truth dict and a dict with\n",
    "        the same keys, where 'precursors' is copied unchanged and every other\n",
    "        value is drawn with ``random.choice`` from ``condition[key]``.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If ``condition`` is a plain dict without one of the fields.\n",
    "        IndexError: If ``condition[key]`` is an empty sequence.\n",
    "    \"\"\"\n",
    "    if condition is None:\n",
    "        condition = condition_set\n",
    "\n",
    "    # Parse the record passed in as ``true`` rather than relying on a global\n",
    "    # loop variable that merely happens to be called ``data``.\n",
    "    truth = json.loads(true['messages'][2]['content'])\n",
    "\n",
    "    prediction = {}\n",
    "    for key, value in truth.items():\n",
    "        if key == 'precursors':\n",
    "            prediction[key] = value\n",
    "        else:\n",
    "            prediction[key] = random.choice(condition[key])\n",
    "\n",
    "    return truth, prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2094/2094 [00:00<00:00, 156023.46it/s]\n"
     ]
    }
   ],
   "source": [
    "true_label = []\n",
    "prediction_label = []\n",
    "\n",
    "# `conditions` keeps duplicates, so each value is drawn in proportion to how\n",
    "# often it was observed in the training fold.\n",
    "for data in tqdm(test_data[0]):\n",
    "    try:\n",
    "        true, prediction = make_random_prediction(data, conditions)\n",
    "    except Exception as err:\n",
    "        # Report the failure and skip this record.\n",
    "        print (err)\n",
    "        continue\n",
    "\n",
    "    true_label.append(true)\n",
    "    prediction_label.append(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the ground truth and the matching predictions as separate pickles.\n",
    "outputs = (\n",
    "    ('true_label_random_feq_save.pickle', true_label),\n",
    "    ('prediction_label_random_feq_save.pickle', prediction_label),\n",
    ")\n",
    "for filename, labels in outputs:\n",
    "    with open(filename, 'wb') as fout:\n",
    "        pickle.dump(labels, fout)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fully random baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_random_solvent(washing=False, solvents=None):\n",
    "    \"\"\"Draw a random combination of up to five distinct solvents.\n",
    "\n",
    "    Args:\n",
    "        washing: If True, an empty draw is allowed and is reported as ``False``.\n",
    "            If False, at least one solvent is drawn.\n",
    "        solvents: Optional iterable of candidate names. When omitted, the keys\n",
    "            of the module-level ``sl_counter`` are used.\n",
    "\n",
    "    Returns:\n",
    "        A list of distinct solvent names, or ``False`` when ``washing`` is True\n",
    "        and no solvent was drawn.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``washing`` is False and there are no candidates.\n",
    "    \"\"\"\n",
    "    pool = list(sl_counter.keys()) if solvents is None else list(solvents)\n",
    "    if not pool and not washing:\n",
    "        raise ValueError('no candidate solvents to sample from')\n",
    "\n",
    "    fewest = 0 if washing else 1\n",
    "    # random.sample raises ValueError when asked for more items than exist,\n",
    "    # so cap the drawn count at the pool size.\n",
    "    count = min(random.randint(fewest, 5), len(pool))\n",
    "\n",
    "    picked = random.sample(pool, count)\n",
    "    if washing and not picked:\n",
    "        return False\n",
    "    return picked"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numeric part of every observed temperature that is non-empty and contains '°C'.\n",
    "temp_list = []\n",
    "for raw in conditions['temperature']:\n",
    "    if raw and '°C' in raw:\n",
    "        temp_list.append(float(raw.replace('°C', '').strip()))\n",
    "\n",
    "# Range used by make_random_temp.\n",
    "temp_max = np.max(temp_list)\n",
    "temp_min = np.min(temp_list)\n",
    "\n",
    "def make_random_temp():\n",
    "    \"\"\"Return a temperature drawn uniformly from [temp_min, temp_max], formatted with one decimal and a ' °C' suffix.\"\"\"\n",
    "    return '{:.1f} °C'.format(random.uniform(temp_min, temp_max))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct pressure values observed in the training fold.\n",
    "list_pressure = [*set(conditions['pressure'])]\n",
    "\n",
    "def make_random_pressure():\n",
    "    \"\"\"Pick one entry of list_pressure uniformly at random.\"\"\"\n",
    "    picked = random.choice(list_pressure)\n",
    "    return picked"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['0 atm', '1 atm', 'autogenous']"
      ]
     },
     "execution_count": 285,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the candidate pressure values that make_random_pressure chooses from.\n",
    "list_pressure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_random_time():\n",
    "    \"\"\"Return a random duration formatted as '<value> <unit>'.\n",
    "\n",
    "    A unit is chosen uniformly from 'min', 'h', 'days' and 'week'. The value\n",
    "    is then drawn uniformly between 0 and the upper bound for that unit and\n",
    "    rendered with Python's default float formatting.\n",
    "    \"\"\"\n",
    "    # Upper bound of the sampled value for each unit.\n",
    "    upper_bound = {'min': 60, 'h': 24, 'days': 7, 'week': 3}\n",
    "\n",
    "    unit = random.choice(['min', 'h', 'days', 'week'])\n",
    "    value = random.uniform(0, upper_bound[unit])\n",
    "\n",
    "    return f'{value} {unit}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_random_syn_method():\n",
    "    \"\"\"Pick one of four synthesis-method labels uniformly at random.\"\"\"\n",
    "    methods = [\n",
    "        'chemical synthesis',\n",
    "        'solvothermal synthesis',\n",
    "        'sonochemical synthesis',\n",
    "        'hydrothermal synthesis',\n",
    "    ]\n",
    "    return random.choice(methods)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_random_dataset(data):\n",
    "    \"\"\"Pair a record's ground truth with a fully random prediction.\n",
    "\n",
    "    Args:\n",
    "        data: Record whose ``messages[2]['content']`` is a JSON string with the\n",
    "            ground-truth fields; it must contain 'precursors'.\n",
    "\n",
    "    Returns:\n",
    "        ``(true, prediction)``. Only 'precursors' is copied from the ground\n",
    "        truth; every other prediction field comes from a random generator.\n",
    "    \"\"\"\n",
    "    def coin_flip():\n",
    "        return random.choice([True, False])\n",
    "\n",
    "    true = json.loads(data['messages'][2]['content'])\n",
    "\n",
    "    # Fields are filled in a fixed order, which is also the order in which\n",
    "    # the random generators are called.\n",
    "    prediction = {'precursors': true['precursors']}\n",
    "    prediction['synthesis_method'] = make_random_syn_method()\n",
    "    prediction['solvent'] = make_random_solvent()\n",
    "    prediction['temperature'] = make_random_temp()\n",
    "    prediction['time'] = make_random_time()\n",
    "    prediction['pressure'] = make_random_pressure()\n",
    "    prediction['cooling'] = coin_flip()\n",
    "    prediction['pH_adjustment'] = coin_flip()\n",
    "    prediction['washing'] = make_random_solvent(washing=True)\n",
    "    prediction['filtration'] = coin_flip()\n",
    "    prediction['drying'] = coin_flip()\n",
    "\n",
    "    return true, prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2094/2094 [00:00<00:00, 95507.53it/s]\n"
     ]
    }
   ],
   "source": [
    "true_label = []\n",
    "prediction_label = []\n",
    "\n",
    "# Generate one fully random prediction per test record of fold 0.\n",
    "for data in tqdm(test_data[0]):\n",
    "    try:\n",
    "        true, prediction = make_random_dataset(data)\n",
    "    except Exception as err:\n",
    "        # Report the failure and skip this record.\n",
    "        print (err)\n",
    "        continue\n",
    "\n",
    "    true_label.append(true)\n",
    "    prediction_label.append(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the ground truth and the matching predictions as separate pickles.\n",
    "outputs = (\n",
    "    ('true_label_random_save.pickle', true_label),\n",
    "    ('prediction_label_random_save.pickle', prediction_label),\n",
    ")\n",
    "for filename, labels in outputs:\n",
    "    with open(filename, 'wb') as fout:\n",
    "        pickle.dump(labels, fout)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llmminer",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
